From fb226ca216d418c1c6ceaf9871a4ef72e1fe7af6 Mon Sep 17 00:00:00 2001 From: "mafetter@fleming.research" Date: Mon, 21 Mar 2005 12:01:36 +0000 Subject: [PATCH] bitkeeper revision 1.1236.32.14 (423eb7a0HqJL37tAErMbIXIQw6Q3Jg) Added prediction of where to find the last writable PTE for a given page; greatly speeds up promotion of a page to be used as a page table. Removed some broken concepts of write protecting PDEs and higher level entries. To write protect a page, all we need to do is write protect all L1 entries that point at it. Fixed a bug with translated IO pages; gotta check that MFNs are really backed by RAM before we go looking in the frame_table for them... Signed-off-by: michael.fetterman@cl.cam.ac.uk --- xen/arch/x86/audit.c | 12 +- xen/arch/x86/mm.c | 2 +- xen/arch/x86/shadow.c | 237 +++++++++++++++++++++++------------ xen/include/asm-x86/mm.h | 9 +- xen/include/asm-x86/shadow.h | 94 +++++++++----- xen/include/xen/perfc.h | 1 + xen/include/xen/perfc_defn.h | 7 +- 7 files changed, 248 insertions(+), 114 deletions(-) diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c index ed3d6dae8c..5aaebd936d 100644 --- a/xen/arch/x86/audit.c +++ b/xen/arch/x86/audit.c @@ -333,22 +333,26 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) smfn = a->smfn; page = &frame_table[smfn]; - adjust(pfn_to_page(gmfn), 0); - switch ( a->gpfn_and_flags & PGT_type_mask ) { + case PGT_writable_pred: + break; case PGT_snapshot: + adjust(pfn_to_page(gmfn), 0); break; case PGT_l1_shadow: + adjust(pfn_to_page(gmfn), 0); adjust_l1_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; case PGT_hl2_shadow: + adjust(pfn_to_page(gmfn), 0); adjust_hl2_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; case PGT_l2_shadow: + adjust(pfn_to_page(gmfn), 0); adjust_l2_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); @@ -619,6 +623,7 @@ void _audit_domain(struct domain *d, int flags) 
scan_for_pfn_in_mfn(d, xmfn, a->smfn); break; case PGT_snapshot: + case PGT_writable_pred: break; default: BUG(); @@ -835,6 +840,9 @@ void _audit_domain(struct domain *d, int flags) errors++; } break; + case PGT_writable_pred: + // XXX - nothing to check? + break; default: BUG(); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 78634bedeb..6988f38cc4 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -268,7 +268,7 @@ int map_ldt_shadow_page(unsigned int off) if ( unlikely(shadow_mode_enabled(d)) ) { shadow_lock(d); - shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn, gmfn); + shadow_remove_all_write_access(d, gpfn, gmfn); } res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page); diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index ca14c5aa9a..adffaec447 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -48,7 +48,6 @@ static inline int shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned long new_type) { - unsigned long min_type, max_type; struct pfn_info *page = pfn_to_page(gmfn); int pinned = 0, okay = 1; @@ -61,20 +60,11 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, } if ( unlikely(page_is_page_table(page)) ) - { - min_type = shadow_max_pgtable_type(d, gpfn) + PGT_l1_shadow; - max_type = new_type; - } - else - { - min_type = PGT_l1_shadow; - max_type = PGT_l1_shadow; - } - FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p", - gpfn, gmfn, new_type, min_type, max_type); + return 1; - if ( (min_type <= max_type) && - !shadow_remove_all_write_access(d, min_type, max_type, gpfn, gmfn) ) + FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p", gpfn, gmfn, new_type); + + if ( !shadow_remove_all_write_access(d, gpfn, gmfn) ) return 0; // To convert this page to use as a page table, the writable count @@ -1737,114 +1727,192 @@ int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va) return 0; } +#define GPFN_TO_GPTEPAGE(_gpfn) 
((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t))) +static inline unsigned long +predict_writable_pte_page(struct domain *d, unsigned long gpfn) +{ + return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred); +} + +static inline void +increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) +{ + unsigned long score = prediction & PGT_score_mask; + int create = (score == 0); + + // saturating addition + score = (score + (1u << PGT_score_shift)) & PGT_score_mask; + score = score ? score : PGT_score_mask; + + prediction = (prediction & PGT_mfn_mask) | score; + + //printk("increase gpfn=%p pred=%p create=%d\n", gpfn, prediction, create); + set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred); + + if ( create ) + perfc_incr(writable_pte_predictions); +} + +static inline void +decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) +{ + unsigned long score = prediction & PGT_score_mask; + ASSERT(score); + + // divide score by 2... We don't like bad predictions. 
+ // + score = (score >> 1) & PGT_score_mask; + + prediction = (prediction & PGT_mfn_mask) | score; + + //printk("decrease gpfn=%p pred=%p score=%p\n", gpfn, prediction, score); + + if ( score ) + set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred); + else + { + delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred); + perfc_decr(writable_pte_predictions); + } +} + static u32 remove_all_write_access_in_ptpage( - struct domain *d, unsigned long pt_mfn, unsigned long readonly_mfn) + struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn, + unsigned long readonly_gpfn, unsigned long readonly_gmfn, + u32 max_refs_to_find, unsigned long prediction) { unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT); unsigned long match = - (readonly_mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT; + (readonly_gmfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT; unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT; int i; - u32 count = 0; + u32 found = 0; int is_l1_shadow = ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) == PGT_l1_shadow); - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) +#define MATCH_ENTRY(_i) (((pt[_i] ^ match) & mask) == 0) + + // returns true if all refs have been found and fixed. 
+ // + int fix_entry(int i) { - if ( unlikely(((pt[i] ^ match) & mask) == 0) ) - { - unsigned long old = pt[i]; - unsigned long new = old & ~_PAGE_RW; + unsigned long old = pt[i]; + unsigned long new = old & ~_PAGE_RW; - if ( is_l1_shadow && - !shadow_get_page_from_l1e(mk_l1_pgentry(new), d) ) - BUG(); + if ( is_l1_shadow && !shadow_get_page_from_l1e(mk_l1_pgentry(new), d) ) + BUG(); + found++; + pt[i] = new; + if ( is_l1_shadow ) + put_page_from_l1e(mk_l1_pgentry(old), d); - count++; - pt[i] = new; +#if 0 + printk("removed write access to pfn=%p mfn=%p in smfn=%p entry %x " + "is_l1_shadow=%d\n", + readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow); +#endif - if ( is_l1_shadow ) - put_page_from_l1e(mk_l1_pgentry(old), d); + return (found == max_refs_to_find); + } - FSH_LOG("removed write access to mfn=%p in smfn=%p entry %x " - "is_l1_shadow=%d", - readonly_mfn, pt_mfn, i, is_l1_shadow); - } + if ( MATCH_ENTRY(readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1)) && + fix_entry(readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1)) ) + { + perfc_incrc(remove_write_fast_exit); + increase_writable_pte_prediction(d, readonly_gpfn, prediction); + unmap_domain_mem(pt); + return found; + } + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(MATCH_ENTRY(i)) && fix_entry(i) ) + break; } unmap_domain_mem(pt); - return count; + return found; +#undef MATCH_ENTRY } int shadow_remove_all_write_access( - struct domain *d, unsigned min_type, unsigned max_type, - unsigned long gpfn, unsigned long gmfn) + struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) { int i; struct shadow_status *a; - unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow); - u32 count = 0; - u32 write_refs; + u32 found = 0, fixups, write_refs; + unsigned long prediction, predicted_gpfn, predicted_smfn; ASSERT(spin_is_locked(&d->arch.shadow_lock)); - ASSERT(gmfn); + ASSERT(VALID_MFN(readonly_gmfn)); perfc_incrc(remove_write_access); - if ( (frame_table[gmfn].u.inuse.type_info & 
PGT_type_mask) == + // If it's not a writable page, then no writable refs can be outstanding. + // + if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) { - write_refs = (frame_table[gmfn].u.inuse.type_info & PGT_count_mask); - if ( write_refs && - (frame_table[gmfn].u.inuse.type_info & PGT_pinned) ) - write_refs--; - if ( write_refs == 0 ) + perfc_incrc(remove_write_not_writable); + return 1; + } + + // How many outstanding writable PTEs for this page are there? + // + write_refs = (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask); + if ( write_refs && (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) ) + write_refs--; + + if ( write_refs == 0 ) + { + perfc_incrc(remove_write_no_work); + return 1; + } + + // Before searching all the L1 page tables, check the typical culprit first. + // + if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) ) + { + predicted_gpfn = prediction & PGT_mfn_mask; + if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) && + (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) ) + { + found += fixups; + if ( found == write_refs ) + { + perfc_incrc(remove_write_predicted); + return 1; + } + } + else { - perfc_incrc(remove_write_access_easy); - return 1; + perfc_incrc(remove_write_bad_prediction); + decrease_writable_pte_prediction(d, readonly_gpfn, prediction); } } + // Search all the shadow L1 page tables... 
+ // for (i = 0; i < shadow_ht_buckets; i++) { a = &d->arch.shadow_ht[i]; while ( a && a->gpfn_and_flags ) { - if ( ((a->gpfn_and_flags & PGT_type_mask) >= min_type) && - ((a->gpfn_and_flags & PGT_type_mask) <= max_type) ) + if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow ) { - switch ( a->gpfn_and_flags & PGT_type_mask ) - { - case PGT_l1_shadow: - count += - remove_all_write_access_in_ptpage(d, a->smfn, gmfn); - if ( count == write_refs ) - return 1; - break; - case PGT_l2_shadow: - if ( sl1mfn ) - count += - remove_all_write_access_in_ptpage(d, a->smfn, - sl1mfn); - if ( count == write_refs ) - return 1; - break; - case PGT_hl2_shadow: - // nothing to do here... - break; - default: - // need to flush this out for 4 level page tables. - BUG(); - } + found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask); + if ( found == write_refs ) + return 1; } + a = a->next; } } FSH_LOG("%s: looking for %d refs, found %d refs\n", - __func__, write_refs, count); + __func__, write_refs, found); return 0; } @@ -1881,7 +1949,7 @@ static u32 remove_all_access_in_page( return count; } -u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn) +u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn) { int i; struct shadow_status *a; @@ -1894,11 +1962,23 @@ u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn) a = &d->arch.shadow_ht[i]; while ( a && a->gpfn_and_flags ) { - if ( ((a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow) || - ((a->gpfn_and_flags & PGT_type_mask) == PGT_hl2_shadow) ) + switch (a->gpfn_and_flags & PGT_type_mask) { - count += remove_all_access_in_page(d, a->smfn, gmfn); + case PGT_l1_shadow: + case PGT_l2_shadow: + case PGT_l3_shadow: + case PGT_l4_shadow: + case PGT_hl2_shadow: + count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn); + break; + case PGT_snapshot: + case 
PGT_writable_pred: + // these can't hold refs to the forbidden page + break; + default: + BUG(); } + a = a->next; } } @@ -2659,6 +2739,7 @@ int _check_all_pagetables(struct exec_domain *ed, char *s) BUG(); // XXX - ought to fix this... break; case PGT_snapshot: + case PGT_writable_pred: break; default: errors++; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 3288b2d4d1..8ddee7e254 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -76,6 +76,7 @@ struct pfn_info #define PGT_l4_shadow PGT_l4_page_table #define PGT_hl2_shadow (5<<29) #define PGT_snapshot (6<<29) +#define PGT_writable_pred (7<<29) /* predicted gpfn with writable ref */ #define PGT_type_mask (7<<29) /* Bits 29-31. */ @@ -95,7 +96,10 @@ struct pfn_info /* 17-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1U<<17)-1) -#define PGT_mfn_mask ((1U<<21)-1) /* mfn mask for shadow types */ +#define PGT_mfn_mask ((1U<<20)-1) /* mfn mask for shadow types */ + +#define PGT_score_shift 20 +#define PGT_score_mask (((1U<<4)-1)< 1) ) { - printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d\n", + printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d writable_ptes=%d\n", live, free, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages), perfc_value(hl2_table_pages), - perfc_value(snapshot_pages)); + perfc_value(snapshot_pages), + perfc_value(writable_pte_predictions)); BUG(); } #endif @@ -941,13 +947,22 @@ static inline unsigned long __shadow_status( ASSERT(gpfn == (gpfn & PGT_mfn_mask)); ASSERT(stype && !(stype & ~PGT_type_mask)); - if ( VALID_MFN(gmfn) && - ((stype != PGT_snapshot) - ? !mfn_is_page_table(gmfn) - : !mfn_out_of_sync(gmfn)) ) + if ( VALID_MFN(gmfn) && (gmfn < max_page) && + (stype != PGT_writable_pred) && + ((stype == PGT_snapshot) + ?
!mfn_out_of_sync(gmfn) + : !mfn_is_page_table(gmfn)) ) { perfc_incrc(shadow_status_shortcut); +#ifndef NDEBUG ASSERT(___shadow_status(d, gpfn, stype) == 0); + + // Undo the affects of the above ASSERT on ___shadow_status()'s perf + // counters. + // + perfc_decrc(shadow_status_calls); + perfc_decrc(shadow_status_miss); +#endif return 0; } @@ -978,21 +993,26 @@ shadow_max_pgtable_type(struct domain *d, unsigned long gpfn) { type = x->gpfn_and_flags & PGT_type_mask; - // Treat an HL2 as if it's an L1 - // - if ( type == PGT_hl2_shadow ) + switch ( type ) + { + case PGT_hl2_shadow: + // Treat an HL2 as if it's an L1 + // type = PGT_l1_shadow; - - // Ignore snapshots -- they don't in and of themselves constitute - // treating a page as a page table - // - if ( type == PGT_snapshot ) + break; + case PGT_snapshot: + case PGT_writable_pred: + // Ignore snapshots -- they don't in and of themselves constitute + // treating a page as a page table + // goto next; - - // Early exit if we found the max possible value - // - if ( type == PGT_base_page_table ) + case PGT_base_page_table: + // Early exit if we found the max possible value + // return type; + default: + break; + } if ( type > pttype ) pttype = type; @@ -1116,7 +1136,8 @@ static inline void delete_shadow_status( found: // release ref to page - put_page(pfn_to_page(gmfn)); + if ( stype != PGT_writable_pred ) + put_page(pfn_to_page(gmfn)); shadow_audit(d, 0); } @@ -1129,15 +1150,16 @@ static inline void set_shadow_status( int i; unsigned long key = gpfn | stype; - SH_VVLOG("set gpfn=%p gmfn=%p smfn=%p t=%p\n", gpfn, gmfn, smfn, stype); + SH_VVLOG("set gpfn=%p gmfn=%p smfn=%p t=%p", gpfn, gmfn, smfn, stype); ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(shadow_mode_translate(d) || gpfn); ASSERT(!(gpfn & ~PGT_mfn_mask)); - - ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful - ASSERT(smfn && !(smfn & ~PGT_mfn_mask)); + + // XXX - need to be more graceful. 
+ ASSERT(VALID_MFN(gmfn)); + ASSERT(stype && !(stype & ~PGT_type_mask)); x = head = hash_bucket(d, gpfn); @@ -1149,17 +1171,24 @@ static inline void set_shadow_status( // grab a reference to the guest page to represent the entry in the shadow // hash table // - get_page(pfn_to_page(gmfn), d); + // XXX - Should PGT_writable_pred grab a page ref? + // - Who/how are these hash table entry refs flushed if/when a page + // is given away by the domain? + // + if ( stype != PGT_writable_pred ) + get_page(pfn_to_page(gmfn), d); /* * STEP 1. If page is already in the table, update it in place. */ do { - if ( x->gpfn_and_flags == key ) + if ( unlikely(x->gpfn_and_flags == key) ) { - BUG(); + if ( stype != PGT_writable_pred ) + BUG(); // we should never replace entries into the hash table x->smfn = smfn; + put_page(pfn_to_page(gmfn)); // already had a ref... goto done; } @@ -1221,6 +1250,13 @@ static inline void set_shadow_status( done: shadow_audit(d, 0); + + if ( stype <= PGT_l4_shadow ) + { + // add to front of list of pages to check when removing write + // permissions for a page... 
+ // + } } /************************************************************************/ diff --git a/xen/include/xen/perfc.h b/xen/include/xen/perfc.h index f954a95f39..323b83e860 100644 --- a/xen/include/xen/perfc.h +++ b/xen/include/xen/perfc.h @@ -65,6 +65,7 @@ extern struct perfcounter perfcounters; #define perfc_incr(x) atomic_inc(&perfcounters.x[0]) #define perfc_decr(x) atomic_dec(&perfcounters.x[0]) #define perfc_incrc(x) atomic_inc(&perfcounters.x[smp_processor_id()]) +#define perfc_decrc(x) atomic_dec(&perfcounters.x[smp_processor_id()]) #define perfc_incra(x,y) \ { if(y<(sizeof(perfcounters.x)/sizeof(*perfcounters.x))) \ atomic_inc(&perfcounters.x[y]); } diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index c7e3707bb0..5a8a09fa56 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -38,6 +38,7 @@ PERFSTATUS( shadow_l2_pages, "current # shadow L2 pages" ) PERFSTATUS( shadow_l1_pages, "current # shadow L1 pages" ) PERFSTATUS( hl2_table_pages, "current # hl2 pages" ) PERFSTATUS( snapshot_pages, "current # fshadow snapshot pages" ) +PERFSTATUS( writable_pte_predictions, "# writable pte predictions") PERFCOUNTER_CPU(shadow_status_shortcut, "fastpath miss on shadow cache") PERFCOUNTER_CPU(shadow_status_calls, "calls to ___shadow_status" ) @@ -73,4 +74,8 @@ PERFCOUNTER_CPU(write_fault_bail, "sf bailed due to write_fault PERFCOUNTER_CPU(read_fault_bail, "sf bailed due to read_fault") PERFCOUNTER_CPU(exception_fixed, "pre-exception fixed") PERFCOUNTER_CPU(remove_write_access, "calls to remove_write_access") -PERFCOUNTER_CPU(remove_write_access_easy, "easy outs of remove_write_access") +PERFCOUNTER_CPU(remove_write_no_work, "no work in remove_write_access") +PERFCOUNTER_CPU(remove_write_not_writable, "remove_write non-writable page") +PERFCOUNTER_CPU(remove_write_fast_exit, "remove_write hit predicted entry") +PERFCOUNTER_CPU(remove_write_predicted, "remove_write predict hit&exit") 
+PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction") -- 2.30.2